In [1]:
# Import the libraries required for exploration and preprocessing
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import seaborn as sns
from importlib import reload
import matplotlib.pyplot as plt
import matplotlib
import warnings
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['axes.facecolor'] = 'white'
# Configure Jupyter Notebook
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.expand_frame_repr', False)
# pd.set_option('max_colwidth', -1)
display(HTML("<style>div.output_scroll { height: 35em; }</style>"))
reload(plt)
%matplotlib inline
%config InlineBackend.figure_format ='retina'
warnings.filterwarnings('ignore')
# configure plotly graph objects
pio.renderers.default = 'iframe'
# pio.renderers.default = 'vscode'
pio.templates["ck_template"] = go.layout.Template(
layout_colorway = px.colors.sequential.Viridis,
# layout_hovermode = 'closest',
# layout_hoverdistance = -1,
layout_autosize=False,
layout_width=800,
layout_height=600,
layout_font = dict(family="Calibri Light"),
layout_title_font = dict(family="Calibri"),
layout_hoverlabel_font = dict(family="Calibri Light"),
# plot_bgcolor="white",
)
# pio.templates.default = 'seaborn+ck_template+gridon'
pio.templates.default = 'ck_template+gridon'
# pio.templates.default = 'seaborn+gridon'
# pio.templates
In [2]:
# Column labels for the raw data files, which carry no header row.
# Order matters: the labels are applied positionally when the files are read.

# Identifier columns
index_names = ['engine', 'cycle']

# Operational settings: setting_1 .. setting_3
setting_names = [f'setting_{number}' for number in range(1, 4)]

# The 21 sensor channels, in file column order.
# The labels are used verbatim as DataFrame column names, so their exact
# spelling (including spacing and brackets) must not be altered casually.
sensor_names = [
    "(Fan inlet temperature) (◦R)",
    "(LPC outlet temperature) (◦R)",
    "(HPC outlet temperature) (◦R)",
    "(LPT outlet temperature) (◦R)",
    "(Fan inlet Pressure) (psia)",
    "(bypass-duct pressure) (psia)",
    "(HPC outlet pressure) (psia)",
    "(Physical fan speed) (rpm)",
    "(Physical core speed) (rpm)",
    "(Engine pressure ratio(P50/P2)",
    "(HPC outlet Static pressure) (psia)",
    "(Ratio of fuel flow to Ps30) (pps/psia)",
    "(Corrected fan speed) (rpm)",
    "(Corrected core speed) (rpm)",
    "(Bypass Ratio) ",
    "(Burner fuel-air ratio)",
    "(Bleed Enthalpy)",
    "(Required fan speed)",
    "(Required fan conversion speed)",
    "(High-pressure turbines Cool air flow)",
    "(Low-pressure turbines Cool air flow)",
]

# Full header: identifiers, then settings, then sensors.
col_names = [*index_names, *setting_names, *sensor_names]
# Load the FD001 train / test / RUL text files.
# DATA_DIR is relative to the notebook's working directory; change it in this one
# place if the files live elsewhere (an earlier layout used './CMaps').
DATA_DIR = 'CMAPSSData'

# The files are whitespace-delimited and have no header row.
# The separator is a raw string: in a plain literal '\s' is an invalid escape
# sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
read_opts = dict(sep=r'\s+', header=None)

df_train = pd.read_csv(f'{DATA_DIR}/train_FD001.txt', names=col_names, **read_opts)
df_test = pd.read_csv(f'{DATA_DIR}/test_FD001.txt', names=col_names, **read_opts)
# The RUL file holds a single column of values.
df_test_RUL = pd.read_csv(f'{DATA_DIR}/RUL_FD001.txt', names=['RUL'], **read_opts)
In [3]:
# Training frame overview: column names, dtypes, non-null counts and memory usage.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20631 entries, 0 to 20630 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 engine 20631 non-null int64 1 cycle 20631 non-null int64 2 setting_1 20631 non-null float64 3 setting_2 20631 non-null float64 4 setting_3 20631 non-null float64 5 (Fan inlet temperature) (◦R) 20631 non-null float64 6 (LPC outlet temperature) (◦R) 20631 non-null float64 7 (HPC outlet temperature) (◦R) 20631 non-null float64 8 (LPT outlet temperature) (◦R) 20631 non-null float64 9 (Fan inlet Pressure) (psia) 20631 non-null float64 10 (bypass-duct pressure) (psia) 20631 non-null float64 11 (HPC outlet pressure) (psia) 20631 non-null float64 12 (Physical fan speed) (rpm) 20631 non-null float64 13 (Physical core speed) (rpm) 20631 non-null float64 14 (Engine pressure ratio(P50/P2) 20631 non-null float64 15 (HPC outlet Static pressure) (psia) 20631 non-null float64 16 (Ratio of fuel flow to Ps30) (pps/psia) 20631 non-null float64 17 (Corrected fan speed) (rpm) 20631 non-null float64 18 (Corrected core speed) (rpm) 20631 non-null float64 19 (Bypass Ratio) 20631 non-null float64 20 (Burner fuel-air ratio) 20631 non-null float64 21 (Bleed Enthalpy) 20631 non-null int64 22 (Required fan speed) 20631 non-null int64 23 (Required fan conversion speed) 20631 non-null float64 24 (High-pressure turbines Cool air flow) 20631 non-null float64 25 (Low-pressure turbines Cool air flow) 20631 non-null float64 dtypes: float64(22), int64(4) memory usage: 4.1 MB
In [4]:
# Test frame overview: column names, dtypes, non-null counts and memory usage.
df_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 13096 entries, 0 to 13095 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 engine 13096 non-null int64 1 cycle 13096 non-null int64 2 setting_1 13096 non-null float64 3 setting_2 13096 non-null float64 4 setting_3 13096 non-null float64 5 (Fan inlet temperature) (◦R) 13096 non-null float64 6 (LPC outlet temperature) (◦R) 13096 non-null float64 7 (HPC outlet temperature) (◦R) 13096 non-null float64 8 (LPT outlet temperature) (◦R) 13096 non-null float64 9 (Fan inlet Pressure) (psia) 13096 non-null float64 10 (bypass-duct pressure) (psia) 13096 non-null float64 11 (HPC outlet pressure) (psia) 13096 non-null float64 12 (Physical fan speed) (rpm) 13096 non-null float64 13 (Physical core speed) (rpm) 13096 non-null float64 14 (Engine pressure ratio(P50/P2) 13096 non-null float64 15 (HPC outlet Static pressure) (psia) 13096 non-null float64 16 (Ratio of fuel flow to Ps30) (pps/psia) 13096 non-null float64 17 (Corrected fan speed) (rpm) 13096 non-null float64 18 (Corrected core speed) (rpm) 13096 non-null float64 19 (Bypass Ratio) 13096 non-null float64 20 (Burner fuel-air ratio) 13096 non-null float64 21 (Bleed Enthalpy) 13096 non-null int64 22 (Required fan speed) 13096 non-null int64 23 (Required fan conversion speed) 13096 non-null float64 24 (High-pressure turbines Cool air flow) 13096 non-null float64 25 (Low-pressure turbines Cool air flow) 13096 non-null float64 dtypes: float64(22), int64(4) memory usage: 2.6 MB
In [5]:
# RUL frame overview: a single 'RUL' column; check its dtype and non-null count.
df_test_RUL.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RUL 100 non-null int64 dtypes: int64(1) memory usage: 928.0 bytes
In [6]:
# Summary statistics for every column, transposed so that each column of
# df_train becomes one row of the displayed table.
df_train.describe(include='all').T
Out[6]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| engine | 20631.0 | 51.506568 | 2.922763e+01 | 1.0000 | 26.0000 | 52.0000 | 77.0000 | 100.0000 |
| cycle | 20631.0 | 108.807862 | 6.888099e+01 | 1.0000 | 52.0000 | 104.0000 | 156.0000 | 362.0000 |
| setting_1 | 20631.0 | -0.000009 | 2.187313e-03 | -0.0087 | -0.0015 | 0.0000 | 0.0015 | 0.0087 |
| setting_2 | 20631.0 | 0.000002 | 2.930621e-04 | -0.0006 | -0.0002 | 0.0000 | 0.0003 | 0.0006 |
| setting_3 | 20631.0 | 100.000000 | 0.000000e+00 | 100.0000 | 100.0000 | 100.0000 | 100.0000 | 100.0000 |
| (Fan inlet temperature) (◦R) | 20631.0 | 518.670000 | 0.000000e+00 | 518.6700 | 518.6700 | 518.6700 | 518.6700 | 518.6700 |
| (LPC outlet temperature) (◦R) | 20631.0 | 642.680934 | 5.000533e-01 | 641.2100 | 642.3250 | 642.6400 | 643.0000 | 644.5300 |
| (HPC outlet temperature) (◦R) | 20631.0 | 1590.523119 | 6.131150e+00 | 1571.0400 | 1586.2600 | 1590.1000 | 1594.3800 | 1616.9100 |
| (LPT outlet temperature) (◦R) | 20631.0 | 1408.933782 | 9.000605e+00 | 1382.2500 | 1402.3600 | 1408.0400 | 1414.5550 | 1441.4900 |
| (Fan inlet Pressure) (psia) | 20631.0 | 14.620000 | 1.776400e-15 | 14.6200 | 14.6200 | 14.6200 | 14.6200 | 14.6200 |
| (bypass-duct pressure) (psia) | 20631.0 | 21.609803 | 1.388985e-03 | 21.6000 | 21.6100 | 21.6100 | 21.6100 | 21.6100 |
| (HPC outlet pressure) (psia) | 20631.0 | 553.367711 | 8.850923e-01 | 549.8500 | 552.8100 | 553.4400 | 554.0100 | 556.0600 |
| (Physical fan speed) (rpm) | 20631.0 | 2388.096652 | 7.098548e-02 | 2387.9000 | 2388.0500 | 2388.0900 | 2388.1400 | 2388.5600 |
| (Physical core speed) (rpm) | 20631.0 | 9065.242941 | 2.208288e+01 | 9021.7300 | 9053.1000 | 9060.6600 | 9069.4200 | 9244.5900 |
| (Engine pressure ratio(P50/P2) | 20631.0 | 1.300000 | 0.000000e+00 | 1.3000 | 1.3000 | 1.3000 | 1.3000 | 1.3000 |
| (HPC outlet Static pressure) (psia) | 20631.0 | 47.541168 | 2.670874e-01 | 46.8500 | 47.3500 | 47.5100 | 47.7000 | 48.5300 |
| (Ratio of fuel flow to Ps30) (pps/psia) | 20631.0 | 521.413470 | 7.375534e-01 | 518.6900 | 520.9600 | 521.4800 | 521.9500 | 523.3800 |
| (Corrected fan speed) (rpm) | 20631.0 | 2388.096152 | 7.191892e-02 | 2387.8800 | 2388.0400 | 2388.0900 | 2388.1400 | 2388.5600 |
| (Corrected core speed) (rpm) | 20631.0 | 8143.752722 | 1.907618e+01 | 8099.9400 | 8133.2450 | 8140.5400 | 8148.3100 | 8293.7200 |
| (Bypass Ratio) | 20631.0 | 8.442146 | 3.750504e-02 | 8.3249 | 8.4149 | 8.4389 | 8.4656 | 8.5848 |
| (Burner fuel-air ratio) | 20631.0 | 0.030000 | 1.387812e-17 | 0.0300 | 0.0300 | 0.0300 | 0.0300 | 0.0300 |
| (Bleed Enthalpy) | 20631.0 | 393.210654 | 1.548763e+00 | 388.0000 | 392.0000 | 393.0000 | 394.0000 | 400.0000 |
| (Required fan speed) | 20631.0 | 2388.000000 | 0.000000e+00 | 2388.0000 | 2388.0000 | 2388.0000 | 2388.0000 | 2388.0000 |
| (Required fan conversion speed) | 20631.0 | 100.000000 | 0.000000e+00 | 100.0000 | 100.0000 | 100.0000 | 100.0000 | 100.0000 |
| (High-pressure turbines Cool air flow) | 20631.0 | 38.816271 | 1.807464e-01 | 38.1400 | 38.7000 | 38.8300 | 38.9500 | 39.4300 |
| (Low-pressure turbines Cool air flow) | 20631.0 | 23.289705 | 1.082509e-01 | 22.8942 | 23.2218 | 23.2979 | 23.3668 | 23.6184 |
In [7]:
# Correlation heatmap of the training columns that draws only strongly
# correlated pairs; every other cell is masked out.
threshold = 0.90      # minimum |correlation| for a cell to be shown
plot_kws = {"s": 1}   # not used in this cell; definition kept unchanged

plt.figure(figsize=(10, 10))
# Note: this changes the seaborn style globally, not just for this figure.
sns.set_style("whitegrid", {"axes.facecolor": ".0"})

df_cluster2 = df_train.corr()

# True marks a cell to hide: any pair whose |correlation| is not >= threshold,
# which also covers NaN entries in the correlation matrix.
mask = ~(df_cluster2.abs() >= threshold)

heatmap_ax = sns.heatmap(
    df_cluster2,
    cmap='RdYlBu',
    annot=True,
    mask=mask,
    linewidths=0.2,
    linecolor='lightgrey',
)
# Masked cells show the axes background, so paint it white.
heatmap_ax.set_facecolor('white')
In [8]:
from ydata_profiling import ProfileReport
In [9]:
%%time
profile = ProfileReport(df_train,
title="Predictive Maintenance",
dataset={"description": "This profiling report was generated for Janet Cheung",
"copyright_holder": "Janet Cheung",
"copyright_year": "2024",
},
explorative=True,
)
profile
CPU times: total: 15.6 ms Wall time: 365 ms
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]